In [1]:
import sys
import logging
In [2]:
# Import the GEM-PRO class
from ssbio.pipeline.gempro import GEMPRO
In [3]:
# Printing multiple outputs per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
Set the logging level in logger.setLevel(logging.<LEVEL_HERE>)
to specify how verbose you want the pipeline to be. Debug is most verbose.
CRITICAL
ERROR
WARNING
INFO
(default)DEBUG
In [4]:
# Create logger
logger = logging.getLogger()
logger.setLevel(logging.INFO) # SET YOUR LOGGING LEVEL HERE #
In [5]:
# Other logger stuff for Jupyter notebooks
handler = logging.StreamHandler(sys.stderr)
formatter = logging.Formatter('[%(asctime)s] [%(name)s] %(levelname)s: %(message)s', datefmt="%Y-%m-%d %H:%M")
handler.setFormatter(formatter)
logger.handlers = [handler]
Set these three things:
ROOT_DIR
PROJECT
will be createdPROJECT
LIST_OF_GENES
A directory will be created in ROOT_DIR
with your PROJECT
name. The folders are organized like so:
ROOT_DIR
└── PROJECT
├── data # General storage for pipeline outputs
├── model # SBML and GEM-PRO models are stored here
├── genes # Per gene information
│ ├── <gene_id1> # Specific gene directory
│ │ └── protein
│ │ ├── sequences # Protein sequence files, alignments, etc.
│ │ └── structures # Protein structure files, calculations, etc.
│ └── <gene_id2>
│ └── protein
│ ├── sequences
│ └── structures
├── reactions # Per reaction information
│ └── <reaction_id1> # Specific reaction directory
│ └── complex
│ └── structures # Protein complex files
└── metabolites # Per metabolite information
└── <metabolite_id1> # Specific metabolite directory
└── chemical
└── structures # Metabolite 2D and 3D structure files
In [6]:
# SET FOLDERS AND DATA HERE
import tempfile
ROOT_DIR = tempfile.gettempdir()
PROJECT = 'mtuberculosis_gp'
GEM_FILE = '../../ssbio/test/test_files/models/iNJ661.json'
GEM_FILE_TYPE = 'json'
PDB_FILE_TYPE = 'mmtf'
In [7]:
# Create the GEM-PRO project
my_gempro = GEMPRO(gem_name=PROJECT, root_dir=ROOT_DIR, gem_file_path=GEM_FILE, gem_file_type=GEM_FILE_TYPE, pdb_file_type=PDB_FILE_TYPE)
First, we need to map these IDs to their protein sequences. There are 2 ID mapping services provided to do this - through KEGG or UniProt. The end goal is to map a UniProt ID to each ID, since there is a comprehensive mapping (and some useful APIs) between UniProt and the PDB.
However, you don't need to map using these services if you already have the amino acid sequences for each protein. You can just manually load in the sequences as shown using the method manual_seq_mapping
. Or, if you already have the UniProt IDs, you can load those in using the method manual_uniprot_mapping
.
In [8]:
gene_to_seq_dict = {'Rv1295': 'MTVPPTATHQPWPGVIAAYRDRLPVGDDWTPVTLLEGGTPLIAATNLSKQTGCTIHLKVEGLNPTGSFKDRGMTMAVTDALAHGQRAVLCASTGNTSASAAAYAARAGITCAVLIPQGKIAMGKLAQAVMHGAKIIQIDGNFDDCLELARKMAADFPTISLVNSVNPVRIEGQKTAAFEIVDVLGTAPDVHALPVGNAGNITAYWKGYTEYHQLGLIDKLPRMLGTQAAGAAPLVLGEPVSHPETIATAIRIGSPASWTSAVEAQQQSKGRFLAASDEEILAAYHLVARVEGVFVEPASAASIAGLLKAIDDGWVARGSTVVCTVTGNGLKDPDTALKDMPSVSPVPVDPVAVVEKLGLA',
'Rv2233': 'VSSPRERRPASQAPRLSRRPPAHQTSRSSPDTTAPTGSGLSNRFVNDNGIVTDTTASGTNCPPPPRAAARRASSPGESPQLVIFDLDGTLTDSARGIVSSFRHALNHIGAPVPEGDLATHIVGPPMHETLRAMGLGESAEEAIVAYRADYSARGWAMNSLFDGIGPLLADLRTAGVRLAVATSKAEPTARRILRHFGIEQHFEVIAGASTDGSRGSKVDVLAHALAQLRPLPERLVMVGDRSHDVDGAAAHGIDTVVVGWGYGRADFIDKTSTTVVTHAATIDELREALGV'}
my_gempro.manual_seq_mapping(gene_to_seq_dict)
In [9]:
manual_uniprot_dict = {'Rv1755c': 'P9WIA9', 'Rv2321c': 'P71891', 'Rv0619': 'Q79FY3', 'Rv0618': 'Q79FY4', 'Rv2322c': 'P71890'}
my_gempro.manual_uniprot_mapping(manual_uniprot_dict)
my_gempro.df_uniprot_metadata.tail(4)
Out[9]:
In [10]:
# KEGG mapping of gene ids
my_gempro.kegg_mapping_and_metadata(kegg_organism_code='mtu')
print('Missing KEGG mapping: ', my_gempro.missing_kegg_mapping)
my_gempro.df_kegg_metadata.head()
Out[10]:
In [11]:
# UniProt mapping
my_gempro.uniprot_mapping_and_metadata(model_gene_source='TUBERCULIST_ID')
print('Missing UniProt mapping: ', my_gempro.missing_uniprot_mapping)
my_gempro.df_uniprot_metadata.head()
Out[11]:
If you have mapped with both KEGG and UniProt mappers, then you can set a representative sequence for the gene using this function. If you used just one, this will just set that ID as representative.
In [12]:
# Set representative sequences
my_gempro.set_representative_sequence()
print('Missing a representative sequence: ', my_gempro.missing_representative_sequence)
my_gempro.df_representative_sequences.head()
Out[12]:
These are the ways to map sequence to structure:
You can only utilize option #1 to map to PDBs if there is a mapped UniProt ID set in the representative sequence. If not, you'll have to BLAST your sequence to the PDB or make a homology model. You can also run both for maximum coverage.
In [13]:
# Mapping using the PDBe best_structures service
my_gempro.map_uniprot_to_pdb(seq_ident_cutoff=.3)
my_gempro.df_pdb_ranking.head()
Out[13]:
In [14]:
# Mapping using BLAST
my_gempro.blast_seqs_to_pdb(all_genes=True, seq_ident_cutoff=.9, evalue=0.00001)
my_gempro.df_pdb_blast.head(2)
Out[14]:
In [15]:
tb_homology_dir = '/home/nathan/projects_archive/homology_models/MTUBERCULOSIS/'
##### EXAMPLE SPECIFIC CODE #####
# Needed to map to older IDs used in this example
import pandas as pd
import os.path as op
old_gene_to_homology = pd.read_csv(op.join(tb_homology_dir, 'data/161031-old_gene_to_uniprot_mapping.csv'))
gene_to_uniprot = old_gene_to_homology.set_index('m_gene').to_dict()['u_uniprot_acc']
my_gempro.get_itasser_models(homology_raw_dir=op.join(tb_homology_dir, 'raw'), custom_itasser_name_mapping=gene_to_uniprot)
### END EXAMPLE SPECIFIC CODE ###
# Organizing I-TASSER homology models
my_gempro.get_itasser_models(homology_raw_dir=op.join(tb_homology_dir, 'raw'))
my_gempro.df_homology_models.head()
Out[15]:
In [16]:
homology_model_dict = {}
my_gempro.get_manual_homology_models(homology_model_dict)
In [ ]:
# Download all mapped PDBs and gather the metadata
my_gempro.download_all_pdbs()
my_gempro.df_pdb_metadata.head(2)
In [17]:
# Set representative structures
my_gempro.set_representative_structure()
my_gempro.df_representative_structures.head()
Out[17]:
In [18]:
# Looking at the information saved within a gene
my_gempro.genes.get_by_id('Rv1295').protein.representative_structure
my_gempro.genes.get_by_id('Rv1295').protein.representative_structure.get_dict()
Out[18]:
Out[18]:
For those proteins with no representative structure, we can create homology models for them. ssbio
contains some built in functions for easily running I-TASSER locally or on machines with SLURM
(ie. on NERSC) or Torque
job scheduling.
You can load in I-TASSER models once they complete using the get_itasser_models
later.
In [19]:
# Prep I-TASSER model folders
my_gempro.prep_itasser_modeling('~/software/I-TASSER4.4', '~/software/ITLIB/', runtype='local', all_genes=False)
Finally, you can save your GEM-PRO as a JSON
or pickle
file, so you don't have to run the pipeline again.
For most functions, if you rerun them, they will check for existing results saved as files. The only function that would take a long time is setting the representative structure, as they are each rechecked and cleaned. This is where saving helps!
In [20]:
import os.path as op
my_gempro.save_pickle(op.join(my_gempro.model_dir, '{}.pckl'.format(my_gempro.id)))
In [21]:
import os.path as op
my_gempro.save_json(op.join(my_gempro.model_dir, '{}.json'.format(my_gempro.id)), compression=False)
In [ ]:
# Loading a pickle file
import pickle
with open('/tmp/mtuberculosis_gp_atlas/model/mtuberculosis_gp_atlas.pckl', 'rb') as f:
my_saved_gempro = pickle.load(f)
In [ ]:
# Loading a JSON file
import ssbio.core.io
my_saved_gempro = ssbio.core.io.load_json('/tmp/mtuberculosis_gp_atlas/model/mtuberculosis_gp_atlas.json', decompression=False)